The first dataset considered is the Steam Video Games Dataset. This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are ‘purchase’ and ‘play’. The value indicates the degree to which the behavior was performed - in the case of ‘purchase’ the value is always 1, and in the case of ‘play’ the value represents the number of hours the user has played the game.
# Load the Steam behaviour log. The file has no header row, so the column
# names are supplied here; read.csv() turns the hyphens into dots (user.id,
# game.title, behavior.name). The fifth column is dropped right after loading.
raw_data <- read.csv(
  "steam-200k.csv",
  header = FALSE,
  col.names = c("user-id", "game-title", "behavior-name", "value", "unknown")
) %>%
  as_tibble() %>%
  select(-unknown)
head(raw_data)
# Number of "play" records per game, most frequent first; rnum is the rank.
line_data <- raw_data %>%
  filter(behavior.name == "play") %>%
  count(game.title, sort = TRUE) %>%
  mutate(rnum = row_number())
# Line chart of the per-game play-record count against popularity rank, with
# a range slider on the x axis.
# TODO: add range-selector buttons; sketch of the attempt:
#   rangeselector = list(buttons = list(
#     list(count = 3, label = "3 mo", step = 1, stepmode = "backward"),
#     list(step = "all")))
# TODO: show the game title on mouse-over
fig <- plot_ly(line_data, x = ~rnum) %>%
  add_lines(y = ~n) %>%
  layout(
    title = "Most Popular Games",
    xaxis = list(
      title = "Games by Popularity",
      rangeslider = list(type = "int")
    ),
    yaxis = list(title = "Num. Of Players")
  )
fig
# One row per "play" record with:
#   time       - hours the user played that game
#   total_time - hours the same user played across all games
#   perc_time  - share of the user's total spent on that game
play_time <- raw_data %>%
  filter(behavior.name == "play") %>%
  rename(time = value) %>%
  group_by(user.id) %>%
  mutate(
    total_time = sum(time),
    perc_time = time / total_time
  ) %>%
  ungroup() %>%
  select(user.id, game.title, time, perc_time, total_time)
# Total hours played per game, ranked from most to least played (rnum = rank).
# The per-record hours (`time`) are what gets summed: `total_time` in play_time
# is each user's total across ALL of their games, so adding it up per game
# would also count the hours those users spent on other titles.
# The output column is still called total_time.
games_play_time <- play_time %>%
  group_by(game.title) %>%
  summarize(total_time = sum(time)) %>%
  ungroup() %>%
  arrange(desc(total_time)) %>%
  mutate(rnum = row_number())
# Keep the top (considered_n - 1) games and fold every game ranked
# considered_n or lower into one "Others" row placed at rank considered_n.
considered_n <- 21
to_plot <- games_play_time %>%
  filter(rnum < considered_n)
tail_times <- games_play_time %>%
  filter(rnum >= considered_n) %>%
  pull(total_time)
# Only add the bucket when there is something to put in it: with fewer than
# considered_n games there is no row at that index to overwrite, so the table
# is plotted as it is.
if (length(tail_times) > 0) {
  to_plot <- to_plot %>%
    add_row(
      game.title = "Others",
      total_time = sum(tail_times),
      rnum = as.integer(considered_n)  # rnum comes from row_number(): integer
    )
}
# TODO: add a button to switch from one chart to the other
# TODO: add value labels on top of the columns
# TODO: could the tallest column be drawn "broken", to show that it would be
#       much taller, with the off-scale value written on top?

# Bar chart of total_time by rank, including the row at rank considered_n.
ggplot(to_plot, aes(x = rnum, y = total_time)) +
  geom_col()

# Same chart without the row at rank considered_n.
ggplot(filter(to_plot, rnum != considered_n), aes(x = rnum, y = total_time)) +
  geom_col()
# Build the slices of one user's play-time pie chart from user_play_time
# (columns used: game.title, perc_time, time).
# NOTE(review): the running share follows the current row order, so this
# presumably expects user_play_time sorted by descending play time - confirm.
to_plot <- user_play_time %>%
  select(game.title, perc_time, time) %>%
  mutate(cum_perc_time = cumsum(perc_time)) %>%
  arrange(cum_perc_time)
if (nrow(user_play_time) > 10) {
  # With more than 10 games, show only the games covering the first
  # perc_to_show of the user's time and fold the rest into "Others".
  perc_to_show <- 0.90
  to_plot <- to_plot %>%
    # The first row is always kept: when a single game already exceeds
    # perc_to_show the threshold alone would leave no rows, and max() of an
    # empty column is -Inf, which made the "Others" share infinite.
    filter(cum_perc_time <= perc_to_show | row_number() == 1L)
  to_plot <- to_plot %>%
    add_row(
      game.title = "Others",
      perc_time = 1.0 - max(to_plot$cum_perc_time),
      time = sum(user_play_time$time) - sum(to_plot$time),
      cum_perc_time = 1.0
    )
}
# If the player has spread their time over a great many games, the table is
# still too long: keep the 7 games with the largest share and fold everything
# else into a new "Others" row.
if (nrow(to_plot) > 10) {
  # Real games only (any earlier "Others" row is dropped), largest share
  # first; the nrow column holds the rank.
  too_big_to_plot <- to_plot %>%
    filter(game.title != "Others") %>%
    select(-cum_perc_time) %>%
    arrange(desc(perc_time)) %>%
    mutate(nrow = row_number())
  to_plot <- too_big_to_plot %>%
    head(7) %>%
    select(game.title, perc_time, time) %>%
    mutate(cum_perc_time = cumsum(perc_time)) %>%
    arrange(cum_perc_time)
  # "Others" takes whatever share and hours the 7 kept games do not cover.
  to_plot <- add_row(
    to_plot,
    game.title = "Others",
    perc_time = 1.0 - max(to_plot$cum_perc_time),
    time = sum(user_play_time$time) - sum(to_plot$time),
    cum_perc_time = 1.0
  )
}
# Add the pie-chart annotations: a two-line label ("<share as %>" over
# "<hours>h") and lab.ypos, the midpoint of each slice along the stacked axis
# once the rows are ordered by descending title.
to_plot <- to_plot %>%
  mutate(
    label = paste(scales::percent(perc_time), paste0(time, "h"), sep = "\n")
  ) %>%
  select(-cum_perc_time) %>%
  arrange(desc(game.title)) %>%
  mutate(lab.ypos = cumsum(perc_time) - perc_time / 2)
to_plot
# Pie chart of one user's play time: one slice per row of to_plot.
# TODO: find a trick to order the legend

# "Pastel1" has only 9 colours, while to_plot may have more rows than that.
# Asking brewer.pal() for more returns 9 colours (with a warning) and
# scale_fill_manual() then fails for lack of values. Take the first colours of
# the full palette when that is enough and interpolate extra ones otherwise.
pastel_base <- brewer.pal(n = 9, name = "Pastel1")
n_slices <- nrow(to_plot)
slice_colors <- if (n_slices <= length(pastel_base)) {
  pastel_base[seq_len(n_slices)]
} else {
  colorRampPalette(pastel_base)(n_slices)
}

to_plot %>%
  ggplot(aes(x = "", y = perc_time, fill = game.title)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y") +
  # Hand-tuned label offset (several variants were tried): the smaller the
  # slice, the larger the x value its label is drawn at.
  geom_text(
    aes(x = 0.3 + (1 - perc_time * .6), y = lab.ypos, label = label),
    color = "black"
  ) +
  scale_fill_manual(values = slice_colors) +
  labs(
    # user.id is repeated on every row of user_play_time; take it once so the
    # title is a single string rather than one string per row.
    title = paste("Play Time of User", user_play_time$user.id[1]),
    fill = "Game Titles"
  ) +
  theme_void()
# For each game, count how many "purchase" records it has.
t1 <- raw_data %>%
  filter(behavior.name == "purchase") %>%
  count(game.title, sort = TRUE, name = "buy_num")
# For each game, count how many "play" records it has.
t2 <- raw_data %>%
  filter(behavior.name == "play") %>%
  count(game.title, sort = TRUE, name = "play_num")
# buy_no_play = 1 - play_num / buy_num, highest first; games missing from one
# of the two tables get a count of 0 there.
full_join(t1, t2, by = "game.title") %>%
  replace_na(list(buy_num = 0, play_num = 0)) %>%
  mutate(buy_no_play = 1 - play_num / buy_num) %>%
  arrange(desc(buy_no_play))
Alcuni sono DLC! In realtà sono stati giocati, perché basta giocare al gioco base. Usiamo un altro CSV per capire quali sono veramente giochi: "Steam games complete dataset".
# Load the second CSV (steam_games.csv) as a tibble, dropping the listed
# columns.
# TODO: if release_date can be turned into a proper date, good; otherwise it
# is fine as it is. Sketch of the attempt:
#   replace_na(list(release_date = "NA")) %>%
#   mutate_at(vars(release_date), ~replace(., is.nan(.), "NA")) %>%
#   mutate(across(c(release_date), function(x) {
#     if (is.character(x) && x != "NA" && x != "NaN") {
#       parse_date(x, "%b %d, %Y", locale = locale("en"))
#     } else {
#       return(NA)
#     }
#   }))
raw_data_2 <- read.csv("steam_games.csv") %>%
  as_tibble() %>%
  select(
    -url, -recent_reviews, -all_reviews, -mature_content,
    -minimum_requirements, -recommended_requirements, -discount_price
  )
# Quick look at the second dataset: first rows, column names, dimensions and
# number of distinct game names (recorded output kept below each call).
head(raw_data_2)
names(raw_data_2)
## [1] "types" "name" "desc_snippet" "release_date"
## [5] "developer" "publisher" "popular_tags" "game_details"
## [9] "languages" "achievements" "genre" "game_description"
## [13] "original_price"
dim(raw_data_2)
## [1] 40833 13
raw_data_2 %>%
  distinct(name) %>%
  nrow()
## [1] 40752
Cerchiamo di capire come distinguere DLC dal resto…
# Among the entries of type "app", list those whose popular_tags mention "DLC".
# The same "DLC" search was also tried on desc_snippet, game_details, genre
# and game_description, and an "Elder Scrolls" search on name.
raw_data_2 %>%
  filter(types == "app", grepl("DLC", popular_tags))
Sembra che ci siano solo giochi (alcuni contengono i DLC, GOTY Edition, ecc.).
# TODO REMOVE
# Column names of the two tables, side by side, to pick the join key.
names(raw_data)
## [1] "user.id" "game.title" "behavior.name" "value"
names(raw_data_2)
## [1] "types" "name" "desc_snippet" "release_date"
## [5] "developer" "publisher" "popular_tags" "game_details"
## [9] "languages" "achievements" "genre" "game_description"
## [13] "original_price"
# One row of metadata per game title found in the behaviour log.
# Titles with no match in raw_data_2 are kept, with NA metadata; when several
# raw_data_2 rows share a name, only the first one is kept.
# distinct(.keep_all = TRUE) replaces group_by() + slice(1): it keeps the same
# rows but returns a plain, ungrouped tibble, so later filters, joins and
# prints do not silently carry a per-title grouping.
games_info_raw <- raw_data %>%
  distinct(game.title) %>%
  arrange(game.title) %>%
  left_join(raw_data_2, by = c("game.title" = "name")) %>%
  distinct(game.title, .keep_all = TRUE)
games_info_raw
# TODO REMOVE
#dim(raw_data %>% distinct(game.title))
#dim(games_info)
Ma di quanti abbiamo effettivamente i dati?
# Titles whose `types` is not NA - presumably those that found a match in
# raw_data_2.
filter(games_info_raw, !is.na(types))
Meno di 2000. Questi giochi appartengono a quanti dei giocatori?
# Restrict the behaviour log to the games with usable metadata: `types` not NA
# and a release_date other than the strings "NaN" and "NA".
data <- raw_data %>%
  right_join(
    games_info_raw %>%
      filter(
        !is.na(types),
        release_date != "NaN",
        release_date != "NA"
      ) %>%
      select(game.title),
    by = "game.title"
  )
data
# Size of what is left: distinct users, distinct games, play rows and
# purchase rows.
distinct(data, user.id)
distinct(data, game.title)
filter(data, behavior.name == "play")
filter(data, behavior.name == "purchase")
Quindi lavorerei con oltre 10k persone e circa 2k giochi, con oltre 90k interazioni user-game (di cui 35k play e 55k purchase). QUESTI SONO I VERI DATI DI PARTENZA.
# Metadata for exactly the game titles present in `data`, printed and then
# saved to games_info.csv without row names.
games_info <- right_join(
  games_info_raw,
  distinct(data, game.title),
  by = "game.title"
)
games_info
write.csv(games_info, "games_info.csv", row.names = FALSE)
# User/game interactions restricted to the rows kept in `data`, printed and
# then saved to users_info.csv without row names.
# semi_join() only filters raw_data. The previous right_join() on
# (user.id, game.title, behavior.name) multiplied rows whenever that key was
# repeated in the log: k duplicate records came out as k * k rows.
users_info <- raw_data %>%
  semi_join(data, by = c("user.id", "game.title", "behavior.name"))
users_info
write.csv(users_info, "users_info.csv", row.names = FALSE)
Grafico “scatter” in cui: X è l'asse temporale; le palle sono i giochi; il diametro è il (log) numero di giocatori; il colore potrebbe essere il genere; Y è il numero di ore totali dei giocatori.
# Read games_info.csv and users_info.csv back from disk as plain data frames
# and print them.
games_info <- read.csv("games_info.csv")
games_info
users_info <- read.csv("users_info.csv")
users_info
games_info